
### Project: IADB Government Payroll Analytics - Country
### Project leader: Dr Christian Schuster
### Code author (s): Robert Lipiński
### Date last update: (run the line below - prints the last-modified time of this script file)
file.mtime(rstudioapi::getActiveDocumentContext()$path)

### Script purpose: filtering out organizations that either for formal reasons, like the intended coverage of the dashboard or the payroll data,
### or informal ones, like irregular data coverage (to be determined below as unexplained drops in the counts of org-year and org-months number
### of observations), should not be shown in the dashboard. Given that determining the latter requires using unique personal ID, this
### script needs to be run after the one that cleans the IDs ('limpiar ID')

### Execution time: ~10 minutes

### Inputs: 
# 1) /data/intermediate/country_06_limpiar_id.[format1]


### Outputs:
# 1) /data/intermediate/country_07_limpiar_cubertura.[format1]

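# *) /data/intermediate_temp/country_07_limpiar_cubertura (temp1).[format1], country_07_limpiar_cubertura (temp3).[format1],
#    country_07_limpiar_cubertura_id (temp1).[format1], country_07_limpiar_cubertura_id (temp2).[format1]
#    [only temporary checkpoint files to avoid re-running the full script in case of an error;
#    not necessary for executing the script]
# *) data/clean/aggregates/multi_payments_certain.parquet (list of almost certain instances of people paid >1 time a month)
# *) data/clean/aggregates/multi_payments_suspect.parquet (list of less certain instances of people paid >1 time a month)
#    NOTE: the two multi_payments files are not written anywhere in the code below - this entry looks carried over
#    from another script; confirm and remove if so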



#
# SET-UP --------------------------------------------------------------------------------------------
#

### Source the '00_country_global.R' script (located in the same folder as this script),
### which is expected to load the required packages and helper functions.
# getSourceEditorContext() is used (rather than getActiveDocumentContext()) because the
# 'active' document can be the console when a line is run from there, in which case the
# returned path is empty and the source() call fails.
source(file.path(dirname(rstudioapi::getSourceEditorContext()$path), '00_country_global.R'))


# library(installr)
# updateR()

# Make an archive copy of this script: '<...>/code/<name>.R' -> '<...>/code/00_ARCHIVE/<name> - copy.R'
# The path is fetched only after source() so that nothing defined in the global script can shadow it.
script_path <- rstudioapi::getSourceEditorContext()$path

# Only the trailing '.R' extension is rewritten (anchored with '$'); an unanchored pattern
# would also rewrite any other '.R' in the path (e.g. a '.Rproj' folder name).
# NOTE(review): gsub('code', ...) still replaces EVERY occurrence of 'code' in the path
# (folder names, user names, file name) - confirm the project path contains it only once.
archive_path <- gsub('code', 'code/00_ARCHIVE', sub('\\.R$', ' - copy.R', script_path))

# returns (and prints) TRUE on success and FALSE if the copy failed, e.g. if the archive folder does not exist
file.copy(script_path, archive_path, overwrite = TRUE, copy.date = TRUE)

#
# READ DATA -------------------------------------------------------------------------------------------------------------------
#
t0 <- Sys.time() # record start time

# Get the columns used in this script: to limit memory use, only those columns of the
# input file whose names occur somewhere in the (lower-cased) text of this script are read.
# NOTE(review): the column list is taken from the '.parquet' version of the input, while the
# data itself is read in 'format1' below - this assumes a parquet copy always exists.
col_names <- names(open_dataset(file.path(main_dir, 'data', 'intermediate', "country_06_limpiar_id.parquet")))

# Read the script text ONCE (previously it was re-read from disk for every column name).
script_lines <- tolower(readLines(rstudioapi::getSourceEditorContext()$path))

# NOTE(review): each column name is used as a regular expression and is matched against
# lower-cased text, so a column name containing upper-case letters is never selected.
# vapply() guarantees a logical vector even when 'col_names' is empty.
col_select1 <- col_names[vapply(col_names,
                                function(col) any(grepl(col, script_lines)),
                                logical(1))]

country <- read_flex(file.path(main_dir, 'data', 'intermediate', "country_06_limpiar_id"),
                     format = format1, col_select = col_select1)

## set as data.table (by reference) if not already one
if (!inherits(country, 'data.table')) {
  setDT(country)
}
gc()


### leave unique org-anyo combinations --------------------------------------------------------------------------------
# NOTE: we only want to keep org-year combinations with at least 'n_min' unique person IDs.
# The filter is applied here, before any org-level summaries are built, so that later
# steps do not extend values to org-year combinations we don't want.


n_min <- 10 # Christian email from 29/07/2025 (11:42) -> "let's filter out anyone with fewer than 10 unique IDs in a year"

# number of unique person IDs per organization-year, attached to every row
country[, org_id_n := uniqueN(person_id), by = .(anyo, organismo_nombre_clean)]

# how many rows fall below the threshold? (uses 'n_min' so the check cannot drift from the filter)
sf(country$org_id_n < n_min)
beep()

country <- country[org_id_n >= n_min]

#### (temp) EXCLUDE legislative/audit/security institutions ----------------------------------------------------------------------------------------------------------------------
# that should be several institutions and ~ 1.1% of all obs

# Regular expression (alternation) of name fragments identifying organizations to exclude.
org_exclude <- paste(c(
  'senado$',
  'camara de diputados', 'parlamentaria',
  'biblioteca del congreso',
  'secretaria general de la presidencia de la republica',

  'servicio de impuestos internos',
  'contraloria general de la republica',

  'judicial', 'justicia', 'fiscalia',

  'agencia nacional de inteligencia', 'gendarmeria',
  'carabineros', 'militar', 'armada', 'fuerza aerea',
  'estado mayor conjunto', # 'joint chiefs'
  'direccion general de movilizacion nacional'
),
collapse = "|")

# The name cleaning and pattern matching are done on a small lookup of unique
# (code, name) pairs rather than on the full payroll table.
# country = country[!grepl(org_exclude, clean_text(organismo_nombre_clean), perl = TRUE)]
org_temp <- unique(country[, .(organismo_codigo, organismo_nombre_clean)])

# standardized name kept in a SEPARATE column so the original name can still serve as a join key
org_temp[, organismo_nombre_std := clean_text(organismo_nombre_clean)]

# set() evaluates the pattern outside the data.table scope, so the regex variable 'org_exclude'
# can never be confused with the column of the same name that is being created
set(org_temp, j = 'org_exclude', value = grepl(org_exclude, org_temp$organismo_nombre_std))
sf(org_temp$org_exclude)


### manually override exclusion for some organizations
## Christian 05/08/2025: For the survey reports: the first three are in (DIPRECA, Fiscalia de Obras Publicas, Fiscal Nacional Economica),
# GENCHI and the Subsecretaria de las Fuerzas Armadas are out.
# We also exclude GENCHI from the dashboard.
# We retitle Subsecretaria de las Fuerzas Armadas to Subsecretaria de las Fuerzas Armadas (sin uniformados) in the payroll dashboard.
org_keep <- c(
  'subsecretaria para las fuerzas armadas (ssffaa)',
  'direccion de prevision de carabineros de country (dipreca)',
  'fiscalia nacional economica (fne)',
  'fiscalia de obras publicas'
)
org_temp[organismo_nombre_std %in% org_keep, org_exclude := FALSE]

# sort(org_temp$organismo_nombre_std[org_temp$org_exclude])


### attach the standardized name and the exclusion flag to the payroll rows
# The lookup is unique on (code, original name), so joining on BOTH keys matches every payroll
# row exactly once. Joining on the code alone would duplicate payroll rows whenever one code
# carries more than one name (or two raw names that clean to the same text).
n_before <- nrow(country)
country <- org_temp[country, on = .(organismo_codigo, organismo_nombre_clean)]
stopifnot(nrow(country) == n_before) # the join must not add or drop rows

# replace the original name with its standardized version (as before, the cleaned name is what is used downstream)
country[, organismo_nombre_clean := organismo_nombre_std]
country[, organismo_nombre_std := NULL]

# change ssffaa name
country[organismo_nombre_clean == 'subsecretaria para las fuerzas armadas (ssffaa)',
        organismo_nombre_clean := tolower('Subsecretaria de las Fuerzas Armadas (sin uniformados)')]


dim(country)
fdistinct(country$organismo_nombre_clean) # before exclusion
beep()

country <- country[org_exclude == FALSE]
fdistinct(country$organismo_nombre_clean) # after exclusion
# country[, org_exclude := NULL]


### remove obs without organismo_nombre_clean (shouldn't be any at this point)
country <- country[!is.na(organismo_nombre_clean)]


### create date 'anyo_mes' column (first day of the month) if not present
# - exact name match: a column that merely CONTAINS 'anyo_mes' in its name does not count
# - inherits(): classes such as c('IDate', 'Date') have length 2, for which
#   'class(x) == "Date"' is not a valid if() condition
if ('anyo_mes' %in% names(country)) {
  if (inherits(country$anyo_mes, 'Date')) {
    print('Date column already present')
  } else {
    # the date arithmetic further below needs a Date; do not continue silently
    warning("'anyo_mes' is present but is not of class Date (class: ",
            paste(class(country$anyo_mes), collapse = '/'), ")", call. = FALSE)
  }
} else {
  print('Adding date column')
  country[, anyo_mes := ymd(paste(anyo, mes, '01', sep = '-'))]
}



## any org-anyo combinations below the minimum number of IDs? (should be none)
# NOTE: 'org_id_n' was computed before the exclusions above and is not re-computed here
# n_min = 10 # Christian email from 29/07/2025 (11:42) -> "let's filter out anyone with fewer than 10 unique IDs in a year"
# country[, org_id_n := uniqueN(person_id), by = .(anyo, organismo_nombre_clean)]
sf(country$org_id_n < n_min)

## security etc organizations out for good?
org1 <- funique(country$organismo_nombre_clean)
sf(grepl('estado mayor conjunto', org1))
sf(grepl('í', org1)) # Spanish diacritics (í the most common)

sf(grepl('direccion de prevision de carabineros de country', org1)) # in
sf(grepl('fiscalia nacional economica', org1)) # in
sf(grepl('fiscalia de obras publicas', org1)) # in
sf(grepl('subsecretaria de las fuerzas armadas', org1)) # in
sf(grepl('genda', org1)) # out



#### > (temp) save ---------------------------------------------------------------------------------------------------------------------
# checkpoint, so the steps above need not be re-run if something fails further below
gc()
write_flex(x = country, file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp1)"), format = format1)
gc()



# ' ---------------------------------------------------------------------------------------------------------------------------------------------------
# DATES OF OBS -------------------------------------------------------------------------------------------------------------------------------------
#

### + day_diff -----------------------------------------------------------------------------------------------------------
# (restart point) country = read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp1)"), format = format1)

### <> filter cols to limit obs. size -----------------------------------------------------------------------------------------------------------------

country_cubertura <- country[, .(row_id_org, person_id, anyo, anyo_mes,
                                 dataset, organismo_nombre_clean)] # leave only desired columns

gc()


# TRUE = compute the day differences anew; FALSE = read them from the temp file saved by an earlier run
anew_day_diff = TRUE

if (anew_day_diff) {

  # one row per person-organization-month
  country_cubertura_id <- unique(country_cubertura[, .(person_id, organismo_nombre_clean, anyo_mes)])

  # temp = country_cubertura_id[, uniqueN(anyo_mes), by = person_id]
  # temp$V1 %>% pr


  ## (1) gaps per PERSON (across all organizations): sort by person and month first
  setkey(country_cubertura_id, person_id, anyo_mes)

  # days since the person's previous record / days until the person's next record
  country_cubertura_id[, day_lag := as.numeric(difftime(anyo_mes, shift(anyo_mes, type = "lag"), units = "days")),
                       by = person_id]
  country_cubertura_id[, day_lead := as.numeric(difftime(shift(anyo_mes, type = "lead"), anyo_mes, units = "days")),
                       by = person_id]


  ## (2) gaps per PERSON WITHIN AN ORGANIZATION: re-sort so months are ordered within person-organization
  setkey(country_cubertura_id, person_id, organismo_nombre_clean, anyo_mes)

  country_cubertura_id[, day_lag_org := as.numeric(difftime(anyo_mes, shift(anyo_mes, type = "lag"), units = "days")),
                       by = .(person_id, organismo_nombre_clean)]
  country_cubertura_id[, day_lead_org := as.numeric(difftime(shift(anyo_mes, type = "lead"), anyo_mes, units = "days")),
                       by = .(person_id, organismo_nombre_clean)]

  ## diagnostics: distribution of the gaps by calendar year
  # 'country_cubertura_id' has no 'anyo' column ('$anyo' would only partially match 'anyo_mes'),
  # so the year is derived from the date. print() is needed because values computed inside
  # an if-block are not auto-printed.
  obs_year <- format(country_cubertura_id$anyo_mes, '%Y')

  print(tapply(country_cubertura_id$day_lead, obs_year, summary))
  print(tapply(country_cubertura_id$day_lag, obs_year, summary))

  print(tapply(country_cubertura_id$day_lag_org, obs_year, summary))
  print(tapply(country_cubertura_id$day_lead_org, obs_year, summary))

  # pr_na(country_cubertura_id$day_lead > 31)
  # cor(country_cubertura_id$day_lag_org, country_cubertura_id$day_lag, use='complete.obs')

  write_flex(x = country_cubertura_id, file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura_id (temp1)"), format = format1)
  gc()

} else {
  country_cubertura_id <- read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura_id (temp1)"), format = format1)
}



### attach the day differences to the row-level table
# Drop stale 'day_*' columns first (relevant when this section is re-run interactively), so the
# join does not produce duplicated 'i.day_*' columns. The prefix 'day_' matches the columns
# created above (day_lag, day_lead, day_lag_org, day_lead_org).
stale_cols <- grep('^day_', names(country_cubertura), value = TRUE)
if (length(stale_cols) > 0) {
  country_cubertura[, (stale_cols) := NULL]
}
country_cubertura <- country_cubertura_id[country_cubertura, on = .(person_id, organismo_nombre_clean, anyo_mes)]



### + org appear dates -------------------------------------------------------------------------------------------------------------
### define appearance date of organizations (note, this needs to be done only AFTER
### the organizations with <10 people/year are filtered out, which should have been done above)

# one row per organization-month observed in the data
org_appear_df <- unique(country_cubertura[, .(organismo_nombre_clean, anyo_mes)])

# first and last month in which each organization is observed
org_appear_df[, `:=`(org_appear_date    = min_miss(anyo_mes),
                     org_disappear_date = max_miss(anyo_mes)),
              by = .(organismo_nombre_clean)]

### all dates (months) organization observed (following filtering above)
setorder(org_appear_df, organismo_nombre_clean, anyo_mes)

# days since the organization's previous observed month / until its next observed month
# (values above ~31 indicate months in which the organization has no data)
org_appear_df[, `:=`(
  anyo_mes_org_lag  = as.numeric(difftime(anyo_mes, shift(anyo_mes, type = "lag"), units = "days")),
  anyo_mes_org_lead = as.numeric(difftime(shift(anyo_mes, type = "lead"), anyo_mes, units = "days"))
), by = .(organismo_nombre_clean)]

### combine
# Drop earlier versions of the columns about to be joined, but only those that actually exist:
# assigning NULL to a missing column makes data.table emit a warning. The two lag/lead columns
# are included so that an interactive re-run of this section does not create 'i.*' duplicates.
stale_cols <- intersect(c("cubertura", "org_appear_date", "org_disappear_date",
                          "anyo_mes_org_lag", "anyo_mes_org_lead"),
                        names(country_cubertura))
if (length(stale_cols) > 0) {
  country_cubertura[, (stale_cols) := NULL]
}
country_cubertura <- org_appear_df[country_cubertura, on = .(organismo_nombre_clean, anyo_mes)]
names(country_cubertura)



# ' ----------------------------------------------------------------------------------------------------------------------------------------------------
# ORG-YEAR FILTER --------------------------------------------------------------------------------------------------------------------------------------
#



### appear [hire] (temp) -----------------------------------------------------------------------------------------------------------------------------------
### NOTE: this variable will be re-calculated after filtering, but here we need it for the first condition - organizational records
### will be deemed too shaky to keep if, among other conditions, >50% employees are newly HIRED


# first and last month in which each person is observed (in any organization)
country_cubertura[, `:=`(id_mes_min = min(anyo_mes, na.rm = TRUE),
                         id_mes_max = max(anyo_mes, na.rm = TRUE)),
                  by = person_id]



# Flag (1/0) the rows where a person counts as newly appearing ('hired'):
#  (a) it is the person's first-ever month AND that month is at least 28 days after the organization's
#      first observed month (so people present from the organization's first month are not 'hires'), OR
#  (b) the person re-appears after a gap longer than BOTH one year (364 days) and the organization's
#      own gap in the data at that month.
# NOTE(review): (b) previously used max_miss(c(364, anyo_mes_org_lag)), which collapses to ONE number
# for the whole table (the largest gap of any organization), so a single organization with a long
# data gap raised the threshold for everybody. pmax() applies the comparison row by row; a missing
# organization lag (organization's first month) falls back to 364. Please confirm this is the intended rule.
country_cubertura[, id_appear := fifelse(
  test = (
    (!is.na(id_mes_min) &
       (id_mes_min - org_appear_date >= 28) &
       anyo_mes == id_mes_min) |
      (!is.na(day_lag) & day_lag > pmax(364, anyo_mes_org_lag, na.rm = TRUE))
  ),
  yes = 1,
  no  = 0)]

### find undesired org-year combinations  -------------------------------------------------------------------------------------------------------------
# NOTE: Two conditions have to be met to filter org-year combination:
# 1) We drop the whole year for an organisation if, relative to a subsequent year, 20% or fewer observations of the total are observed
# (see Christian's email from 31/07/2025 16:48)
# 2) >50% of people are new public sector hires (Robert's response to the above email from 31/07/2025, 18:19)


# unique person IDs per org-year, in total (n) and flagged as newly appearing (n_hire)
# which() drops rows with a missing 'id_appear', which would otherwise add a spurious NA 'person' to n_hire
org_id <- country_cubertura[, .(n      = uniqueN(person_id),
                                n_hire = uniqueN(person_id[which(id_appear == 1)])),
                            by = .(anyo, organismo_nombre_clean)]

# arrange in order (years ascending within organization) - required for the lead values below
setorder(org_id, organismo_nombre_clean, anyo)

# what % of next year observations is the present one?
org_id[, n_lead_pc := n / shift(n, type = "lead"), by = .(organismo_nombre_clean)]

# what % of people newly hired (in the public sector) in the SUBSEQUENT(!) org-year combo?
# Computed within organization: without 'by', the last year of each organization was given
# the first year of the NEXT organization in the table.
org_id[, n_hire_lead := shift(n_hire, type = "lead") / shift(n, type = "lead"),
       by = .(organismo_nombre_clean)]

### remove a given row (org-year combination)?
# 'small_year' = this year has at most 20% of the IDs of the following year.
# A missing lead value (= final year of the organization) never triggers removal.
org_id[, remove := fifelse(!(n_lead_pc > .2 | is.na(n_lead_pc)),
                           'yes', 'no')]
# as above, but additionally require that more than 50% of people in the following year are newly hired
org_id[, remove2 := fifelse(!(n_lead_pc > .2 | is.na(n_lead_pc)) & n_hire_lead > 0.5,
                            'yes', 'no')]



## >checks - are there any orgs where >5x jumps happen more than once? R: there shouldn't be
org_id[, remove_n := sum(remove == 'yes'), by = organismo_nombre_clean]
sf(org_id$remove_n)



### filter out anything before the jump too, as otherwise strange gaps appear that would also fulfill the above condition
### (e.g. there are 50 employees in 2022 and 60 in 2023 and then 1,200 in 2024 - the above cannot only filter out 2023, as then
### the jump from 2022 to 2024 would still be too large. So filter anything BEFORE 2023, at least in this example)
# na = 'no': for an organization with no flagged year the comparison has nothing to compare against;
# if max_miss() returns NA in that case the flag must resolve to 'no', otherwise the NA rows would
# silently fail the 'remove2 == "no"' condition below and the whole organization would be dropped.
org_id[, remove  := fifelse(anyo <= max_miss(anyo[remove == 'yes']),  'yes', 'no', na = 'no'), by = .(organismo_nombre_clean)]
org_id[, remove2 := fifelse(anyo <= max_miss(anyo[remove2 == 'yes']), 'yes', 'no', na = 'no'), by = .(organismo_nombre_clean)]

table(org_id$remove, org_id$remove2)



### OVER-WRITE REMOVAL MANUALLY
# instituto nacional de estadisticas (ine) -> fits the filtering condition above, so manually over-write it
org_id[organismo_nombre_clean == 'instituto nacional de estadisticas (ine)',
       c('remove', 'remove2') := 'no']


# org-years kept / removed
unique(org_id[remove2 == "no", .(anyo, organismo_nombre_clean)])
unique(org_id[remove2 == "yes", .(anyo, organismo_nombre_clean)])


### < remove labelled org-year combinations  ---------------------------------------------------------------------------------------------------------------------
# inner join (nomatch = 0) against the org-years to keep; dim() before/after shows how many rows are dropped
dim(country_cubertura)
country_cubertura <- org_id[remove2 == "no", .(anyo, organismo_nombre_clean)][country_cubertura, on = .(anyo, organismo_nombre_clean), nomatch = 0]
dim(country_cubertura)






# ' ----------------------------------------------------------------------------------------------------------------------------------------------------
# ORG-MONTH FILTER --------------------------------------------------------------------------------------------------------------------------------------
#


# (restart point) country_cubertura = read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp3)"), format = format1)
# gc()


# number of unique person IDs per organization-month (grouped output has one row per group)
org_mes <- country_cubertura[, .(n = uniqueN(person_id)), by = .(anyo, anyo_mes, organismo_nombre_clean)]

setorder(org_mes, organismo_nombre_clean, anyo, anyo_mes)

### subset to org-mes combinations that fall on or after the month in which 10 IDs are first recorded
org_mes[, anyo_mes_start := min_miss(anyo_mes[n >= 10]), by = .(organismo_nombre_clean)]

# remove = 1 for months before that start month, and for organizations with no start month at all
# (a missing start means the organization never reaches 10 unique IDs in a month)
org_mes[, remove := fifelse(!(anyo_mes >= anyo_mes_start & !is.na(anyo_mes_start)), 1, 0)]
org_mes <- org_mes[remove == 0]



## compare N in each month to the mean monthly N of the organization (over its remaining months)
org_mes[, n_mean := mean_miss(n), by = .(organismo_nombre_clean)]
org_mes[, n_pc_mean := n / n_mean]

# how many org-months fall below 25% of the organization's mean, and in how many organizations?
sf(org_mes$n_pc_mean < .25)
fdistinct(org_mes$organismo_nombre_clean[org_mes$n_pc_mean < .25])



### label org-months for removal
### Christian's email from 02/08/2025 (16:46) -> suppress ORG-MONTHS, not org-years
# remove2: month with fewer than 25% of the organization's mean number of IDs
# remove3: organization whose mean monthly number of IDs is below 10
org_mes[, `:=`(remove2 = fifelse(n_pc_mean < .25, 1, 0),
               remove3 = fifelse(n_mean < 10, 1, 0))]


### attach the labels to the row-level table (keeps every row of country_cubertura, as a left join would)
# Rows whose org-month was already dropped from 'org_mes' above find no match and get NA labels.
org_mes_flags <- org_mes[, .(anyo_mes, organismo_nombre_clean, remove2, remove3)]
country_cubertura <- org_mes_flags[country_cubertura, on = .(organismo_nombre_clean, anyo_mes)]


### > checks - some organizations that should have months labelled for removal
# only the LAST entry is inspected; move another name to the end to check that organization instead
check_orgs <- c('seremi de salud de aysen',
                'servicio local de educacion publica de magallanes',
                'municipalidad de curanilahue',
                'agencia de calidad de la educacion')
temp <- country_cubertura[grepl(check_orgs[length(check_orgs)], organismo_nombre_clean)]
hist(sf(temp$anyo_mes))
tapply(temp$anyo_mes, (temp$remove2 + temp$remove3) > 0, sf)



### < remove the labelled org-months combinations from the file ----------------------------------------------------------------------------------------------------
# Only rows with both labels equal to 0 are kept; rows with NA labels (org-months dropped from
# 'org_mes' earlier) do not satisfy the condition and are therefore removed as well.
dim(country_cubertura)
dim(country_cubertura[remove2 == 0 & remove3 == 0])

country_cubertura <- country_cubertura[remove2 == 0 & remove3 == 0]
country_cubertura[, c('remove2', 'remove3') := NULL]



# > (temp) save --------------------------------------------------------------------------------------------------------------
gc()
write_flex(x = country_cubertura, file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp3)"), format = format1)
# gc()


# ' ---------------------------------------------------------------------------------------------------------------------------------------------------
# [RE-RUN] DATES OF OBS -------------------------------------------------------------------------------------------------------------------------------------
#
# The day differences are computed again because the org-year and org-month filters above
# removed rows, which changes the previous/next record of the people affected.

### + day_diff -----------------------------------------------------------------------------------------------------------
# (restart point) country_cubertura = read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp3)"), format = format1)


# TRUE = compute the day differences anew; FALSE = read them from the temp file saved by an earlier run
anew_day_diff = TRUE

if (anew_day_diff) {

  # one row per person-organization-month
  country_cubertura_id <- unique(country_cubertura[, .(person_id, organismo_nombre_clean, anyo_mes)])

  # temp = country_cubertura_id[, uniqueN(anyo_mes), by = person_id]
  # temp$V1 %>% pr


  ## (1) gaps per PERSON (across all organizations): sort by person and month first
  setkey(country_cubertura_id, person_id, anyo_mes)

  # days since the person's previous record / days until the person's next record
  country_cubertura_id[, day_lag := as.numeric(difftime(anyo_mes, shift(anyo_mes, type = "lag"), units = "days")),
                       by = person_id]
  country_cubertura_id[, day_lead := as.numeric(difftime(shift(anyo_mes, type = "lead"), anyo_mes, units = "days")),
                       by = person_id]


  ## (2) gaps per PERSON WITHIN AN ORGANIZATION: re-sort so months are ordered within person-organization
  setkey(country_cubertura_id, person_id, organismo_nombre_clean, anyo_mes)

  country_cubertura_id[, day_lag_org := as.numeric(difftime(anyo_mes, shift(anyo_mes, type = "lag"), units = "days")),
                       by = .(person_id, organismo_nombre_clean)]
  country_cubertura_id[, day_lead_org := as.numeric(difftime(shift(anyo_mes, type = "lead"), anyo_mes, units = "days")),
                       by = .(person_id, organismo_nombre_clean)]

  ## diagnostics: distribution of the gaps by calendar year
  # 'country_cubertura_id' has no 'anyo' column ('$anyo' would only partially match 'anyo_mes'),
  # so the year is derived from the date. print() is needed because values computed inside
  # an if-block are not auto-printed.
  obs_year <- format(country_cubertura_id$anyo_mes, '%Y')

  print(tapply(country_cubertura_id$day_lead, obs_year, summary))
  print(tapply(country_cubertura_id$day_lag, obs_year, summary))

  print(tapply(country_cubertura_id$day_lag_org, obs_year, summary))
  print(tapply(country_cubertura_id$day_lead_org, obs_year, summary))

  # pr_na(country_cubertura_id$day_lead > 31)
  # cor(country_cubertura_id$day_lag_org, country_cubertura_id$day_lag, use='complete.obs')

  write_flex(x = country_cubertura_id, file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura_id (temp2)"), format = format1)
  gc()

} else {
  country_cubertura_id <- read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura_id (temp2)"), format = format1)
}

### overwrite the first-pass values
# Drop the 'day_*' columns and the organization appear/disappear dates computed before the
# filters (only those that exist, to avoid data.table warnings), then attach the new day differences.
stale_cols <- union(grep('^day_', names(country_cubertura), value = TRUE),
                    intersect(c('org_appear_date', 'org_disappear_date'), names(country_cubertura)))
if (length(stale_cols) > 0) {
  country_cubertura[, (stale_cols) := NULL]
}

country_cubertura <- country_cubertura_id[country_cubertura, on = .(person_id, organismo_nombre_clean, anyo_mes)]



### + org appear dates -------------------------------------------------------------------------------------------------------------
### define appearance date of organizations (note, this needs to be done only AFTER
### the organizations with <10 people/year are filtered out, which is done near the top of this
### script, and after the org-year and org-month filters above)

# first and last month in which each organization is observed: one row per organization,
# computed from the unique organization-month combinations
org_appear_df <- unique(country_cubertura[, .(organismo_nombre_clean, anyo_mes)])[
  , .(org_appear_date    = min_miss(anyo_mes),
      org_disappear_date = max_miss(anyo_mes)),
  by = .(organismo_nombre_clean)]


### combine
# Drop earlier versions of the columns about to be (re-)created, but only those that actually
# exist: assigning NULL to a missing column makes data.table emit a warning.
stale_cols <- intersect(c('cubertura', 'org_appear_date', 'org_disappear_date'),
                        names(country_cubertura))
if (length(stale_cols) > 0) {
  country_cubertura[, (stale_cols) := NULL]
}
country_cubertura <- org_appear_df[country_cubertura, on = .(organismo_nombre_clean)]



# ' -------------------------------------------------------------------------------------------------------------------------------------------
# CUBERTURA  --------------------------------------------------------------------------------------------------------------------------------

# number of distinct years in which each organization is observed
entidad_anyo <- country_cubertura %>% fgroup_by(organismo_nombre_clean) %>% fsummarise(n = fdistinct(anyo))

# coverage label: 'completo' if the organization is observed in as many years as there are in
# start_year:end_year, 'incompleto' otherwise.
# The lookup is built as a fresh data.table (organization + label only) rather than via
# dplyr::mutate() followed by ':=' - modifying a data.table by reference after a dplyr verb
# triggers data.table's invalid-selfref warning and an extra copy.
n_years_full <- length(start_year:end_year)
entidad_anyo <- data.table(
  organismo_nombre_clean = entidad_anyo$organismo_nombre_clean,
  cubertura = ifelse(entidad_anyo$n == n_years_full, 'completo', 'incompleto')
)

# drop an earlier 'cubertura' column only if there is one (assigning NULL to a missing column warns)
if ('cubertura' %in% names(country_cubertura)) {
  country_cubertura[, cubertura := NULL]
}
country_cubertura <- entidad_anyo[country_cubertura, on = .(organismo_nombre_clean)]
pr(country_cubertura$cubertura) # R: incompleto should be ~8.4% of all observations across 2019-2024 period now



# ' ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
# > FINAL SAVE  ---------------------------------------------------------------------------------------------------------------------------------------------------------
# (restart point) country_cubertura = read_flex(file.path(main_dir, 'data', 'intermediate_temp', "country_07_limpiar_cubertura (temp4)"), format = format1)
# gc()

### combine with full --------------------------------------------------------------------------------------------------------------------
# Keep only the row identifier and the variables created in this script.
# 'person_id' is deliberately NOT carried along: it is already in the full file read below, and
# having it on both sides of the join added a redundant 'i.person_id' column to the saved output.
# Plain data.table column selection is used so that setindex() below works on a valid data.table.
keep_cols <- c('row_id_org', 'cubertura', 'org_appear_date', 'org_disappear_date',
               grep('^day', names(country_cubertura), value = TRUE))
country_cubertura <- country_cubertura[, keep_cols, with = FALSE]

# full version of the input (all columns, all rows)
# NOTE(review): the organization names in this file are the ones saved by script 06, so the name
# standardization and the re-titling of the 'Subsecretaria de las Fuerzas Armadas' done earlier in
# this script are not part of the saved output - confirm that this is handled downstream.
country_save <- read_flex(file.path(main_dir, 'data', 'intermediate', "country_06_limpiar_id"), format = format1)

setindex(country_save, row_id_org)
setindex(country_cubertura, row_id_org)

# inner join (nomatch = 0): only the rows that survived all filters above are kept
country_save <- country_cubertura[country_save, on = 'row_id_org', nomatch = 0]
beep()

## checks
head(country_cubertura)
dim(country_save)
fdistinct(country_save$row_id_org) # should equal the number of rows of country_save
sf(is.na(country_cubertura$row_id_org)) # should be no missing row IDs


### save ---------------------------------------------------------------------------------------------------------------------------------------------------------------
write_flex(x = country_save, file.path(main_dir, 'data', 'intermediate', "country_07_limpiar_cubertura"), format = format1)
gc()

beep('complete')

exec_time_fun('exec_time')


#
# FIN DEL CÓDIGO  --------------------------------------------------------------------------------------------
# 